First, let’s check if “DESeq2” package is installed. And then, load the package.

library(AnnotationDbi)
library(org.Hs.eg.db)
library(DESeq2)
library(ggplot2)

1. Processing the data

The computational analysis of an RNA-seq experiment begins from the FASTQ files that contain the nucleotide sequence of each read and a quality score at each position. These reads must first be aligned to a reference genome or transcriptome. The output of this alignment step is commonly stored in a file format called SAM/BAM. This is the workflow we followed last day.

Lets perform some exploratory differential gene expression analysis. Note: this analysis is for demonstration only. NEVER do differential expression analysis this way!

First, load the data.

counts = read.csv("airway_scaledcounts.csv", stringsAsFactors = FALSE,row.names = 1)
metadata = read.csv("airway_metadata.csv", stringsAsFactors = FALSE)
rmarkdown::paged_table(counts)
rmarkdown::paged_table(metadata)

Subset the metadata into two data frames.

control = metadata[metadata$dex=="control",]
treated = metadata[metadata$dex=="treated",]
rmarkdown::paged_table(control)
rmarkdown::paged_table(treated)

Determine the mean count values for all genes accross control experiments

control.mean = rowSums(counts[,control$id])/length(counts[,control$id])
treated.mean = rowSums(counts[,treated$id])/length(counts[,treated$id])

Let’s store the control.mean and treated.mean together for ease of use

meancounts = data.frame(control.mean, treated.mean)
rmarkdown::paged_table(meancounts)
colSums(meancounts)
## control.mean treated.mean 
##     23005324     22196524

Plot the control vs treated.

plot(meancounts[,1:2],xlab="Control",ylab="Treated")

We see a few dozen dots outside of the big clump around the origin. So, let’s try plotting both axes on a log scale.

plot(meancounts[,1:2],log="xy",xlab="log Control",ylab="log Treated")
## Warning in xy.coords(x, y, xlabel, ylabel, log): 15032 x values <= 0
## omitted from logarithmic plot
## Warning in xy.coords(x, y, xlabel, ylabel, log): 15281 y values <= 0
## omitted from logarithmic plot

Here we calculate log2foldchange.

meancounts$log2fc = log2(meancounts[,"treated.mean"]/meancounts[,"control.mean"])

We will now remove the NaN and -Inf values. The NaN is returned when you divide by zero and try to take the log. The -Inf is returned when you try to take the log of zero.

zero.vals = which(meancounts[,1:2]==0, arr.ind=TRUE)
to.rm = unique(zero.vals[,1])
meancounts = meancounts[-to.rm,]
rmarkdown::paged_table(meancounts)

How many genes are up in the drug treated cells and how many are down?

up.ind = meancounts$log2fc > 2
down.ind = meancounts$log2fc < (-2)
sum(up.ind)
## [1] 250
sum(down.ind)
## [1] 367

2. Annotation

We can add annotation from a supplied CSV file, such as those available from ENSEMBLE or UCSC. The annotables_grch38.csv annotation table links the unambiguous Ensembl gene ID to other useful annotation like the gene symbol, full gene name, location, Entrez gene ID, etc.

anno = read.csv("annotables_grch38.csv")

Use the merge funtion to add the annotation data from the “anno” object to our RNA-Seq results in “meancounts”

# use the merge function
meancounts.anno = merge(meancounts, anno, by.x="row.names", by.y="ensgene")
rmarkdown::paged_table(meancounts.anno)

We can use the mapIds() function to add individual columns to our results table.

columns(org.Hs.eg.db)
##  [1] "ACCNUM"       "ALIAS"        "ENSEMBL"      "ENSEMBLPROT" 
##  [5] "ENSEMBLTRANS" "ENTREZID"     "ENZYME"       "EVIDENCE"    
##  [9] "EVIDENCEALL"  "GENENAME"     "GO"           "GOALL"       
## [13] "IPI"          "MAP"          "OMIM"         "ONTOLOGY"    
## [17] "ONTOLOGYALL"  "PATH"         "PFAM"         "PMID"        
## [21] "PROSITE"      "REFSEQ"       "SYMBOL"       "UCSCKG"      
## [25] "UNIGENE"      "UNIPROT"
meancounts$symbol <- mapIds(org.Hs.eg.db,
                     keys=row.names(meancounts),
                     column="SYMBOL",
                     keytype="ENSEMBL",
                     multiVals="first")
## 'select()' returned 1:many mapping between keys and columns
meancounts$entrez <- mapIds(org.Hs.eg.db,
                     keys=row.names(meancounts),
                     column="ENTREZID",
                     keytype="ENSEMBL",
                     multiVals="first")
## 'select()' returned 1:many mapping between keys and columns
meancounts$uniprot <- mapIds(org.Hs.eg.db,
                     keys=row.names(meancounts),
                     column="UNIPROT",
                     keytype="ENSEMBL",
                     multiVals="first")
## 'select()' returned 1:many mapping between keys and columns
rmarkdown::paged_table(meancounts[up.ind,])
rmarkdown::paged_table(meancounts[down.ind,])

3. Differential gene expression

DESeq is Differential gene expression analysis based on the negative binomial distribution. Setup the object needed for DESeq analysis

counts <- read.csv("airway_scaledcounts.csv", stringsAsFactors = FALSE)
dds = DESeqDataSetFromMatrix(countData=counts, 
                             colData=metadata, 
                             design=~dex, 
                             tidy=TRUE)
## converting counts to integer mode
## Warning in DESeqDataSet(se, design = design, ignoreRank): some variables in
## design formula are characters, converting to factors
dds = DESeq(dds)
## estimating size factors
## estimating dispersions
## gene-wise dispersion estimates
## mean-dispersion relationship
## final dispersion estimates
## fitting model and testing
res = results(dds)
summary(res)
## 
## out of 25258 with nonzero total read count
## adjusted p-value < 0.1
## LFC > 0 (up)       : 1563, 6.2%
## LFC < 0 (down)     : 1188, 4.7%
## outliers [1]       : 142, 0.56%
## low counts [2]     : 9971, 39%
## (mean count < 10)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results
rmarkdown::paged_table(as.data.frame(res[order(res$pvalue),]))

Change the p-value threshold again to 0.01

res01 = results(dds, alpha=0.01)
summary(res01)
## 
## out of 25258 with nonzero total read count
## adjusted p-value < 0.01
## LFC > 0 (up)       : 850, 3.4%
## LFC < 0 (down)     : 581, 2.3%
## outliers [1]       : 142, 0.56%
## low counts [2]     : 9033, 36%
## (mean count < 6)
## [1] see 'cooksCutoff' argument of ?results
## [2] see 'independentFiltering' argument of ?results

Store our results as a data.frame object, sort or order our results by the adjusted p-value

resSig01 = subset(as.data.frame(res), padj < 0.01)
rmarkdown::paged_table(resSig01[order(resSig01$padj),])

4. Data visualization

Let’s first see what the gene ID is for the CRISPLD2 gene using:

resSig01[grep("CRISPLD2", resSig01$symbol),]
## [1] baseMean       log2FoldChange lfcSE          stat          
## [5] pvalue         padj          
## <0 rows> (or 0-length row.names)
rownames(resSig01[grep("CRISPLD2", resSig01$symbol),])
## character(0)

We can mow use this returned object to plot a boxplot with the base graphics function

d = plotCounts(dds, gene="ENSG00000103196", intgroup="dex", returnData=TRUE)
head(d)
##                count     dex
## SRR1039508  774.5002 control
## SRR1039509 6258.7915 treated
## SRR1039512 1100.2741 control
## SRR1039513 6093.0324 treated
## SRR1039516  736.9483 control
## SRR1039517 2742.1908 treated
boxplot(count ~ dex , data=d)

Or use ggplot2

ggplot(d, aes(dex, count)) + 
geom_boxplot(aes(fill=dex)) + 
scale_y_log10() + 
ggtitle("CRISPLD2")

Volcano plot

# Setup our custom point color vector 
mycols = rep("gray", nrow(res))
mycols[ res$padj < 0.01 ]  <- "red" 
mycols[ abs(res$log2FoldChange) > 2 ]  <- "blue"
mycols[ (res$padj < 0.01) & (abs(res$log2FoldChange) > 2 ) ] <- "green"

palette( c("gray","blue") )
plot( res$log2FoldChange,  -log(res$padj), col=mycols, ylab="-Log(P-value)", xlab="Log2(FoldChange)")

# Add some cut-off lines
abline(v=c(-2,2), col="darkgray", lty=2)
abline(h=-log(0.1), col="darkgray", lty=2)

Or use ggplot2

ggplot(as.data.frame(res), aes(log2FoldChange, -log10(pvalue), col=mycols)) + 
geom_point() + 
ggtitle("Volcano plot")
## Warning: Removed 13578 rows containing missing values (geom_point).